Package org.terrier.structures.merging

Source Code of org.terrier.structures.merging.LexiconMerger

/*
* Terrier - Terabyte Retriever
* Webpage: http://terrier.org
* Contact: terrier{a.}dcs.gla.ac.uk
* University of Glasgow - School of Computing Science
* http://www.gla.ac.uk/
*
* The contents of this file are subject to the Mozilla Public License
* Version 1.1 (the "License"); you may not use this file except in
* compliance with the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS"
* basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See
* the License for the specific language governing rights and limitations
* under the License.
*
* The Original Code is LexiconMerger.java.
*
* The Original Code is Copyright (C) 2004-2011 the University of Glasgow.
* All Rights Reserved.
*
* Contributor(s):
*   Ben He <ben{a.}dcs.gla.ac.uk> (original author)
*   Craig Macdonald <craigm{a.}dcs.gla.ac.uk>
*/

package org.terrier.structures.merging;

import java.io.IOException;
import java.util.Date;
import java.util.Iterator;
import java.util.Map;

import org.apache.log4j.Logger;
import org.terrier.structures.FSOMapFileLexiconOutputStream;
import org.terrier.structures.Index;
import org.terrier.structures.IndexUtil;
import org.terrier.structures.LexiconEntry;
import org.terrier.structures.LexiconOutputStream;
import org.terrier.structures.Pointer;
import org.terrier.structures.SimpleBitIndexPointer;
import org.terrier.structures.indexing.LexiconBuilder;
import org.terrier.structures.seralization.FixedSizeWriteableFactory;

/**
* Merges many lexicons, termids and offsets are not kept.
* @author vassilis
*/
public class LexiconMerger {

  /** The logger used */
  private static Logger logger = Logger.getLogger(LexiconMerger.class);

  protected Index srcIndex1;
  protected Index srcIndex2;
  protected Index destIndex;

 
  /**
   * A constructor that sets the filenames of the lexicon
   * files to merge
   * @param src1 Source index 1
   * @param src2 Source index 2
   * @param dest Destination index
   */
  public LexiconMerger(Index src1, Index src2, Index dest)
  {
    srcIndex1 = src1;
    srcIndex2 = src2;
    destIndex = dest;
  }
 
  /**
   * Merges the two lexicons into one. After this stage, the offsets in the
   * lexicon are not correct. They will be updated only after creating the
   * inverted file.
   */
  @SuppressWarnings("unchecked")
  public void mergeLexicons() {
    try {
     
      //setting the input streams
      Iterator<Map.Entry<String,LexiconEntry>> lexInStream1 =
        (Iterator<Map.Entry<String,LexiconEntry>>)srcIndex1.getIndexStructureInputStream("lexicon");
      Iterator<Map.Entry<String,LexiconEntry>> lexInStream2 =
        (Iterator<Map.Entry<String,LexiconEntry>>)srcIndex2.getIndexStructureInputStream("lexicon");
     
      for(String structure : new String[]{"lexicon-keyfactory", "lexicon-valuefactory"})
      {
        IndexUtil.copyStructure(srcIndex1, destIndex, structure, structure);
      }
      for(String property : new String[] {"max.term.length", "index.inverted.fields.count"} )
      {
        destIndex.setIndexProperty(property, srcIndex1.getIndexProperty(property, null));
      }
     
      //setting the output stream
      LexiconOutputStream<String> lexOutStream = new FSOMapFileLexiconOutputStream(
          destIndex,
          "lexicon",
           (Class <FixedSizeWriteableFactory<LexiconEntry>>)destIndex.getIndexStructure("lexicon-valuefactory").getClass()
          );
     
      boolean hasMore1 = false;
      boolean hasMore2 = false;
      String term1;
      String term2;

      int termId = 0;
     
      Pointer p = new SimpleBitIndexPointer();
   
      hasMore1 = lexInStream1.hasNext();
      hasMore2 = lexInStream2.hasNext();
      Map.Entry<String,LexiconEntry> lee1 = lexInStream1.next();
      Map.Entry<String,LexiconEntry> lee2 = lexInStream2.next();
      while (hasMore1 && hasMore2) {
       
       
       
        term1 = lee1.getKey();
        term2 = lee2.getKey();
        int lexicographicalCompare = term1.compareTo(term2);
        if (lexicographicalCompare < 0) {
          lee1.getValue().setTermId(termId);
          lee1.getValue().setPointer(p);
          lexOutStream.writeNextEntry(term1, lee1.getValue());
          termId++;
          if (hasMore1 = lexInStream1.hasNext()) lee1 = lexInStream1.next();
       
        } else if (lexicographicalCompare > 0) {
          lee2.getValue().setTermId(termId);
          lee2.getValue().setPointer(p);
          lexOutStream.writeNextEntry(term2, lee2.getValue());
          termId++;
          if (hasMore2 = lexInStream2.hasNext()) lee2 = lexInStream2.next();
        } else {
          lee1.getValue().setTermId(termId);
          lee1.getValue().setPointer(p);
          lee1.getValue().add(lee2.getValue());
          lexOutStream.writeNextEntry(term1, lee1.getValue());
          if (hasMore1 = lexInStream1.hasNext()) lee1 = lexInStream1.next();
          if (hasMore2 = lexInStream2.hasNext()) lee2 = lexInStream2.next();
          termId++;
        }
      }
     
      if (hasMore1) {
        while (hasMore1) {
          lee1.getValue().setTermId(termId);
          lee1.getValue().setPointer(p);
          lexOutStream.writeNextEntry(lee1.getKey(), lee1.getValue());
          if (hasMore1 = lexInStream1.hasNext()) lee1 = lexInStream1.next();
          termId++;
        }
      } else if (hasMore2) {
        while (hasMore2) {
          lee1.getValue().setTermId(termId);
          lee1.getValue().setPointer(p);
          lexOutStream.writeNextEntry(lee2.getKey(), lee2.getValue());
          if (hasMore2 = lexInStream2.hasNext()) lee2 = lexInStream2.next();
          termId++;
        }   
      }
      IndexUtil.close(lexInStream1);
      IndexUtil.close(lexInStream2);
      lexOutStream.close();
      //recopy the value factory to ensure the field settings are correct
      for(String structure : new String[]{"lexicon-valuefactory"})
      {
        IndexUtil.copyStructure(srcIndex1, destIndex, structure, structure);
      }
     
      LexiconBuilder.optimise(destIndex, "lexicon");
      destIndex.flush();
    } catch(IOException ioe) {
      logger.error("IOException while merging lexicons.", ioe);
    }
  }
  /**
   * main
   * @param args
   * @throws Exception
   */
  public static void main(String[] args) throws Exception {

    if (args.length != 6)
    {
      logger.fatal("usage: java org.terrier.structures.merging.LexiconMerger srcPath1 srcPrefix1 srcPath2 srcPrefix2 destPath1 destPrefix1 ");
      return;
    }
    Index.setIndexLoadingProfileAsRetrieval(false);
   
    Index indexSrc1 = Index.createIndex(args[0], args[1]);
    Index indexSrc2 = Index.createIndex(args[2], args[3]);
    Index indexDest = Index.createNewIndex(args[4], args[5]);

    LexiconMerger lMerger = new LexiconMerger(indexSrc1, indexSrc2, indexDest);
    long start = System.currentTimeMillis();
    if(logger.isInfoEnabled()){
      //logger.info("started at " + (new Date()));
    }
    lMerger.mergeLexicons();
    indexSrc1.close();
    indexSrc2.close();
    indexDest.close();

    if(logger.isInfoEnabled()){
      //logger.info("finished at " + (new Date()));
      long end = System.currentTimeMillis();
      //logger.info("time elapsed: " + ((end-start)*1.0d/1000.0d) + " sec.");
    }
  }

 
}
TOP

Related Classes of org.terrier.structures.merging.LexiconMerger

TOP
Copyright © 2018 www.massapi.com. All rights reserved.
All source code is the property of its respective owners. Java is a trademark of Sun Microsystems, Inc and owned by ORACLE Inc. Contact coftware#gmail.com.